# helper functions that are used across the project

# Tokenize a data frame of texts and return a document-feature matrix (dfm).
#
# Arguments:
#   data         data frame handed to quanteda::corpus().
#   text_col     name of the column holding the document text.
#   doc_col      name of the column holding the document IDs.
#   to_compound  optional character vector of extra multi-word phrases to join
#                into single tokens. When NULL (the default) no compounding is
#                performed at all, including the built-in phrase list.
#   stem         if TRUE (default), stem tokens with quanteda::tokens_wordstem().
#   ngrams       n-gram size(s) passed to quanteda::tokens_ngrams().
#
# Returns a list with two elements:
#   dfm.out  the document-feature matrix (features lower-cased by dfm()).
#   corp     the corpus built from `data`.
make.dfm <- function(data, text_col, doc_col, to_compound = NULL,
                     stem = TRUE, ngrams = 1) {
  # library() instead of require(): stop right here if quanteda is missing
  library(quanteda)

  message("Making corpus")
  corp <- quanteda::corpus(data, text_field = text_col, docid_field = doc_col)

  message("Tokenizing...")
  # NOTE(review): `remove_twitter` appears to be deprecated in newer quanteda
  # releases -- confirm against the installed version before upgrading.
  tok <- quanteda::tokens(corp,
                          remove_punct = TRUE,
                          remove_twitter = TRUE,
                          remove_url = TRUE)

  # Plain assignments instead of a %>% chain, so this function does not depend
  # on some other package having attached the magrittr pipe.
  # drop tokens that contain a Unicode separator character
  tok <- quanteda::tokens_remove(tok, "\\p{Z}", valuetype = "regex")
  # NOTE(review): as an unanchored regex this drops every token that
  # *contains* "via" (e.g. "bolivia"), not only the word "via" -- confirm
  # that this is intended before anchoring it as "^via$".
  tok <- quanteda::tokens_remove(tok, "via", valuetype = "regex")
  # remove stop words
  tok <- quanteda::tokens_remove(tok, quanteda::stopwords(source = "snowball"))

  if (!is.null(to_compound)) {
    message("Setting compounds...")
    compounds <- quanteda::phrase(
      c("white house", "fox news", "united states",
        "united nations",
        "social media", "climate change", "right wing",
        to_compound)
    )
    # concatenate common bigrams
    tok <- quanteda::tokens_compound(tok, pattern = compounds)
  }

  if (stem == TRUE) {
    tok <- quanteda::tokens_wordstem(tok)
  }

  tok <- quanteda::tokens_ngrams(tok, ngrams)

  message("Casting to dfm...")
  bags_dfm <- quanteda::dfm(tok, verbose = TRUE, tolower = TRUE)

  return(list(dfm.out = bags_dfm,
              corp = corp))
}

# Return the IDs of documents (tweets) that contain at least one keyword.
#
# For each distinct value of `dat$date`, builds an unstemmed dfm of 1- to
# 3-grams with make.dfm() and keeps the documents with a non-zero count in
# any feature whose name is in `keywords`.
#
# Arguments:
#   dat       data frame / data.table with a `date` column plus the text and
#             ID columns named below.
#   keywords  character vector compared (exact match) with the dfm feature
#             names. Features are lower-cased by make.dfm(); multi-word
#             keywords presumably need quanteda's n-gram concatenator ("_")
#             -- confirm against tokens_ngrams().
#   textcol   name of the text column.
#   doccol    name of the document-ID column.
#
# Returns an unnamed vector of matching document IDs, or NULL if none match.
get.ids <- function(dat, keywords, textcol, doccol) {
  library(data.table)
  library(quanteda)
  dat <- data.table(dat)

  if (!"date" %in% names(dat)) {
    stop("`dat` must contain a `date` column", call. = FALSE)
  }

  # lapply() rather than sapply(): sapply() simplifies to a matrix whenever
  # every date yields the same number of IDs, which unlist() then leaves as a
  # matrix instead of a flat vector.
  ids_by_date <- lapply(unique(dat$date), function(d) {
    t.dfm <- make.dfm(data = dat[date == d],
                      text_col = textcol,
                      doc_col = doccol,
                      stem = FALSE,
                      ngrams = 1:3)$dfm.out

    cols <- which(colnames(t.dfm) %in% keywords)
    if (length(cols) < 1) {
      return(NULL)
    }
    rownames(t.dfm)[rowSums(t.dfm[, cols]) > 0]
  })

  unlist(ids_by_date, use.names = FALSE)
}

# Run pivot scaling (parrot::scale_text) on tweets, optionally restricted to
# those whose text matches a keyword pattern.
## note: the parrot package is not on CRAN
## to install, run devtools::install_github("wilryh/parrot", dependencies=TRUE)
## these analyses run using version 0.2.2
#
# Arguments:
#   dat        data frame / data.table with a `text` column. When
#              subset = TRUE it must also contain index, twitter_handle,
#              tweetscore, left_right, inside_outside and datetime.
#   keystring  regular expression matched against the lower-cased text
#              (so it should itself be lower case); used only when
#              subset = TRUE.
#   subjname   not used by this function; kept so existing calls still work.
#   subset     if TRUE (default), keep only rows matching `keystring` and the
#              columns listed above; otherwise use all of `dat`.
#
# Returns the object produced by scale_text().
parrot_routine <- function(dat, keystring, subjname, subset = TRUE) {
  # library() instead of require(): fail immediately on a missing package
  library(stm)
  library(parrot)
  library(data.table)
  library(quanteda)

  # Coerce (and copy) first: the subsetting below uses data.table syntax and
  # the cleaning steps modify `text` by reference, which must not touch the
  # caller's object.
  dat <- data.table(dat)

  if (subset == TRUE) {
    message("Subsetting... \n")

    # The earlier get.ids() call at this point passed an argument that
    # get.ids() does not accept (so it always errored) and its result was
    # never used; the subset has always been defined by this grep().
    sub <- dat[grep(keystring, tolower(text)),
               list(index, twitter_handle, tweetscore, left_right,
                    inside_outside, datetime, text)]
  } else {
    sub <- dat
  }

  # remove hashtags and handles
  sub[, text := stringr::str_remove_all(text, "\\#([^ ]+)|\\@([^ ]+)")]

  # remove shortlinks
  sub[, text := stringr::str_remove_all(text, "\\S+t\\.co\\S+")]

  # remove regular links
  sub[, text := stringr::str_remove_all(text, "http\\S+\\s*")]

  # remove non-ASCII characters (emojis, etc.)
  sub[, text := stringr::str_remove_all(text, "[^[:ascii:]]")]

  # remove ampersand word conversion
  # (fixed() is namespaced because stringr is never attached here)
  sub[, text := stringr::str_remove_all(text, stringr::fixed("&amp"))]

  # remove other junk words; replace with a single space so the words on
  # either side of " via " are not glued together
  sub[, text := stringr::str_replace_all(text, " via ", " ")]

  message("Processing... \n")
  processed <- textProcessor(
    documents = sub$text,
    metadata = data.frame(sub),
    removestopwords = TRUE, lowercase = TRUE, stem = TRUE,
    removenumbers = TRUE,
    removepunctuation = TRUE
  )

  message("Prepping... \n")
  out <- prepDocuments(
    processed$documents, processed$vocab, processed$meta
  )

  message("Extracting embeddings.... \n")
  tdm <- doc_to_tdm(out)

  message("Calculating word scores... \n")
  scores <- scale_text(
    meta = out$meta,
    tdm = tdm,
    constrain_outliers = TRUE,
    pivot = 2
    ##    embeddings=embeddings[["meta"]], ## embeddings have little effect
    ##    on output -- if used, consider setting pivot lower (e.g. pivot = 1/2)
  )

  return(scores)
}

# Alternate pivot-scaling routine that takes the document IDs to keep as an
# argument instead of identifying them inside the function.
## note: the parrot package is not on CRAN
## to install, run devtools::install_github("wilryh/parrot", dependencies=TRUE)
## these analyses run using version 0.2.2
#
# Arguments:
#   dat          data frame / data.table with a `text` column and, when
#                `sub.ids` is given, an `id` column.
#   sub.ids      optional vector of `id` values to keep; NULL (default) uses
#                every row of `dat`.
#   subjname     not used by this function; kept so existing calls still work.
#   pivot_power  value passed to scale_text() as `pivot`.
#   subset       not used by this function; kept so existing calls still work.
#
# Returns the object produced by scale_text().
parrot_routine_tokensub <- function(dat, sub.ids = NULL,
                                    subjname,
                                    pivot_power = 3,
                                    subset = TRUE) {
  # library() instead of require(): fail immediately on a missing package
  library(stm)
  library(parrot)
  library(data.table)
  library(quanteda)

  # Coerce before subsetting: `dat[id %in% sub.ids]` is data.table syntax and
  # fails on a plain data.frame.
  dat <- data.table(dat)

  if (!is.null(sub.ids)) {
    message("Subsetting... \n")

    sub <- dat[id %in% sub.ids]
  } else {
    sub <- dat
  }

  message("Processing... \n")
  processed <- textProcessor(
    documents = sub$text,
    metadata = data.frame(sub),
    removestopwords = TRUE, lowercase = TRUE, stem = TRUE,
    removenumbers = TRUE,
    removepunctuation = TRUE
  )

  message("Prepping... \n")
  out <- prepDocuments(
    processed$documents, processed$vocab, processed$meta
  )

  message("Extracting embeddings.... \n")
  tdm <- doc_to_tdm(out)

  message("Calculating word scores... \n")
  scores <- scale_text(
    meta = out$meta,
    tdm = tdm,
    constrain_outliers = TRUE,
    pivot = pivot_power,
    compress_fast = TRUE,
    simple = FALSE
    ##    embeddings=embeddings[["meta"]], ## embeddings have little effect
    ##    on output -- if used, consider setting pivot lower (e.g. pivot = 1/2)
  )

  return(scores)
}

# Modified version of parrot::get_keywords to extract keywords for tables.
#
# Arguments:
#   scores          list with `vocab` (character vector) and `pivot_scores`
#                   (matrix; column i + 1 is read for dimension i and the
#                   first column is skipped). `word_scores`, laid out the same
#                   way, is needed only when pivots_only = FALSE.
#   n_dimensions    a single number n (dimensions 1..n) or a vector of
#                   dimension indices.
#   n_words         number of keywords taken from each end of a ranking.
#   stretch         odd integer exponent applied to the word scores.
#   capture_output  if TRUE, return the tables instead of printing them.
#   pivots_only     if TRUE (default), tables hold only the two pivot columns;
#                   otherwise the two word-score columns are added.
#   topic           text appended to the table caption when printing through
#                   knitr; defaults to "" so printing works without it.
#
# Returns, when capture_output = TRUE, a list of data frames named "D<i>";
# otherwise prints one table per dimension and returns NULL invisibly.
get_keywords_custom <- function (scores, 
                                 n_dimensions, 
                                 n_words = 15, 
                                 stretch = 3, 
                                 capture_output = FALSE, 
                                 pivots_only = TRUE, topic = "") 
{
  all_keywords <- list()
  if (stretch %% 2 != 1) 
    stop("Please enter odd integer for \"stretch\"")

  # seq_len() so that n_dimensions = 0 means "no dimensions", not c(1, 0)
  dims <- if (length(n_dimensions) == 1) {
    seq_len(n_dimensions)
  } else {
    n_dimensions
  }

  # Per-word magnitude over all dimensions; the same for every i, so computed
  # once. drop = FALSE keeps a matrix even when there is a single dimension.
  pivot_norm <- sqrt(rowSums(scores$pivot_scores[, -1, drop = FALSE]^2))

  for (i in dims) {
    general_keywords <- scores$vocab[
      order(scores$pivot_scores[, i + 1] * pivot_norm, decreasing = TRUE)
    ]

    if (pivots_only) {
      keywords <- data.frame(
        utils::head(rev(general_keywords), n = n_words),
        utils::head(general_keywords, n = n_words)
      )
      names(keywords) <- c("pivots (-)", "(+) pivots")
    } else {
      specific_keywords <- scores$vocab[
        order(scores$word_scores[, i + 1]^(stretch) * pivot_norm,
              decreasing = TRUE)
      ]
      keywords <- data.frame(
        utils::head(rev(specific_keywords), n = n_words),
        utils::head(rev(general_keywords), n = n_words),
        utils::head(general_keywords, n = n_words),
        utils::head(specific_keywords, n = n_words)
      )
      names(keywords) <- c("scores (-)", "pivots (-)", 
                           "(+) pivots", "(+) scores")
    }

    if (capture_output) {
      all_keywords[[paste0("D", i)]] <- keywords
    } else if (!requireNamespace("knitr", quietly = TRUE)) {
      # plain-text fallback when knitr is not installed
      cat("\nDimension", i, "keywords\n\n")
      print(keywords, row.names = FALSE)
      cat("\n")
    } else {
      print(knitr::kable(keywords, align = "c", format = "pandoc", 
                         caption = paste("Dimension", i, "keywords: ", topic)))
      cat("\n")
    }
  }

  if (capture_output) {
    return(all_keywords)
  }
  invisible(NULL)
}

# Modified version of parrot::plot_keywords() to extract words for plotting.
#
# Arguments:
#   scores        list with `word_scores` (matrix; column d + 1 is read for
#                 dimension d), `word_counts` and `vocab`; `importance` and
#                 `pivot_scores` are read when unstretch = TRUE, and `color`
#                 is used (if present) when color = TRUE.
#   x_dimension,
#   y_dimension   dimensions shown on the x and y axes.
#   q_cutoff      only words whose count exceeds this quantile of
#                 `word_counts` are drawn.
#   dims_cutoff   (non-colour plot) words whose absolute score exceeds this on
#                 either axis are drawn as repelled text labels; words below
#                 it on both axes are drawn as faint points.
#   mo            passed to ggrepel::geom_text_repel() as `max.overlaps`.
#   plot_density  if TRUE, draw the plot next to density plots of both
#                 dimensions with gridExtra::grid.arrange() instead of
#                 returning the plot object.
#   unstretch     if TRUE, rescale `word_scores` before plotting.
#   color         if TRUE, draw every word above the count cutoff as text
#                 coloured by `scores$color` (k-means clusters are computed
#                 when that element is absent).
#   subjname      subject name inserted into the plot title.
#
# Returns the ggplot object when plot_density = FALSE; otherwise the result
# of gridExtra::grid.arrange().
plot_keywords_custom <- function (scores, x_dimension = 1, y_dimension = 2, 
                                  q_cutoff = 0.9, 
                                  dims_cutoff = 0,
                                  mo = 50,
                                  plot_density = FALSE, unstretch = FALSE, color = FALSE,
                                  subjname = "subject"){
  # library() instead of require(): fail immediately if ggrepel is missing
  library(ggrepel)

  if (unstretch) {
    scores$word_scores <- sweep(scores$word_scores, 1, 
                                sqrt(rowSums((scores$importance[-1] * scores$pivot_scores[, -1])^2)) + 1, `/`)
  }
  word_scores <- data.frame(scores$word_scores)
  word_counts <- scores$word_counts
  above_cutoff <- word_counts > stats::quantile(word_counts, q_cutoff)

  # the first column of word_scores is skipped, so dimension d is column d + 1
  x_dimension <- x_dimension + 1
  y_dimension <- y_dimension + 1

  xd <- word_scores[, x_dimension]
  yd <- word_scores[, y_dimension]

  # symmetric axis limits based on the words that are actually drawn
  x_max <- max(abs(xd[above_cutoff]))
  y_max <- max(abs(yd[above_cutoff]))

  # && because both operands are scalars
  if (color && !("color" %in% names(scores))) {
    # NOTE(review): columns 2:11 are hard-coded, so this errors when
    # word_scores has fewer than 11 columns.
    scores$color <- factor(stats::kmeans(scores$word_scores[, 2:11], 
                                         centers = 5)$cluster)
  }

  if (!color) {
    # words far enough out on either axis get a text label ...
    is_label <- above_cutoff & (abs(xd) > dims_cutoff | abs(yd) > dims_cutoff)
    # ... words close to the origin on both axes become faint points
    is_point <- above_cutoff & (abs(xd) < dims_cutoff & abs(yd) < dims_cutoff)

    g <- ggplot2::ggplot() + 
      ggplot2::geom_point(data = word_scores[is_point, ], 
                          ggplot2::aes(x = xd[is_point], 
                                       y = yd[is_point]),
                          alpha = .2) + 
      ggrepel::geom_text_repel(data = word_scores[is_label, ], 
                               ggplot2::aes(x = xd[is_label], 
                                            y = yd[is_label], 
                                            label = scores$vocab[is_label]),
                               max.overlaps = mo) + 
      ggplot2::xlab(paste("Dimension:", x_dimension - 1)) + 
      ggplot2::ylab(paste("Dimension:", y_dimension - 1)) + 
      # "none" replaces the deprecated logical form guides(size = F)
      ggplot2::guides(size = "none") + 
      ggplot2::theme_classic() + 
      ggplot2::xlim(-x_max, x_max) + 
      ggplot2::ylim(-y_max, y_max) +
      ggplot2::ggtitle(paste0("Top two dimensions in pundits' tweets referencing ", subjname),
                       subtitle = paste0("Top ", 100*(1-q_cutoff), "% most common words"))

    if (abs(dims_cutoff) > 0) {
      # labs() is namespaced so it does not rely on ggplot2 being attached
      g <- g +
        ggplot2::labs(caption = paste0("Words with absolute value score >",abs(dims_cutoff)," on either dimension labeled, remainder in points"))
    }
  }
  else {
    g <- ggplot2::ggplot() + 
      ggplot2::geom_text(data = word_scores[above_cutoff, ], 
                         ggplot2::aes(x = xd[above_cutoff], 
                                      y = yd[above_cutoff], 
                                      # labels use the same mask as x and y;
                                      # the extra dims_cutoff filter made the
                                      # label vector shorter than the
                                      # coordinates
                                      label = scores$vocab[above_cutoff]), 
                         color = scores$color[above_cutoff]) + 
      ggplot2::xlab(paste("Dimension:", x_dimension - 1)) + 
      ggplot2::ylab(paste("Dimension:", y_dimension - 1)) + 
      ggplot2::guides(size = "none", color = "none") + 
      ggplot2::theme_classic() + 
      ggplot2::xlim(-x_max, x_max) + 
      ggplot2::ylim(-y_max, y_max) +
      ggplot2::ggtitle(paste0("Top two dimensions in pundits' tweets referencing ", subjname),
                       subtitle = paste0("Top ", 100*(1-q_cutoff), "% most common words shown"))
  }

  if (!plot_density) {
    return(g)
  }
  else {
    # main plot on the left (two thirds), one density panel per dimension
    # stacked on the right
    gridExtra::grid.arrange(g, ggplot2::ggplot() + 
                              ggplot2::geom_density(ggplot2::aes(x = word_scores[, x_dimension])) +
                              ggplot2::xlab(paste("Dimension:", x_dimension - 1)) + 
                              ggplot2::theme_classic(), 
                            ggplot2::ggplot() + 
                              ggplot2::geom_density(ggplot2::aes(x = word_scores[,  y_dimension])) + 
                              ggplot2::xlab(paste("Dimension",  y_dimension - 1)) + 
                              ggplot2::theme_classic(), 
                            layout_matrix = rbind(c(1, 1, 2), c(1, 1, 3)))
  }
}

# Cast a data frame of texts to a document-feature matrix (dfm).
#
# Arguments:
#   data         data frame handed to quanteda::corpus().
#   text_col     name of the column holding the document text.
#   doc_col      name of the column holding the document IDs.
#   to_compound  character vector of extra multi-word phrases to join into
#                single tokens, in addition to the built-in list; NULL
#                (default) uses the built-in list only.
#   stem         if TRUE (default), stem tokens with quanteda::tokens_wordstem().
#
# Returns the dfm (features lower-cased by dfm()).
just.make.dfm <- function(data, text_col, doc_col, to_compound = NULL,
                          stem = TRUE) {
  # library() instead of require(): stop right here if quanteda is missing
  library(quanteda)

  message("Making corpus")
  corp <- quanteda::corpus(data, text_field = text_col, docid_field = doc_col)

  message("Setting compounds...")
  # "united nations" is written with a space like every other phrase: with an
  # underscore, phrase() yields a single-token pattern that can never match
  # the two separate tokens "united" "nations".
  compounds <- quanteda::phrase(
    c("white house", "fox news", "united states",
      "united nations",
      "social media", "climate change", "right wing",
      to_compound)
  )

  message("Tokenizing...")
  # NOTE(review): `remove_twitter` appears to be deprecated in newer quanteda
  # releases -- confirm against the installed version before upgrading.
  tok <- quanteda::tokens(corp,
                          remove_punct = TRUE,
                          remove_twitter = TRUE,
                          remove_url = TRUE)

  # Plain assignments instead of a %>% chain, so this function does not depend
  # on some other package having attached the magrittr pipe.
  # drop tokens that contain a Unicode separator character
  tok <- quanteda::tokens_remove(tok, "\\p{Z}", valuetype = "regex")
  # NOTE(review): as an unanchored regex this drops every token that
  # *contains* "via" (e.g. "bolivia"), not only the word "via" -- confirm
  # that this is intended before anchoring it as "^via$".
  tok <- quanteda::tokens_remove(tok, "via", valuetype = "regex")
  # remove stop words
  tok <- quanteda::tokens_remove(tok, quanteda::stopwords(source = "snowball"))
  # concatenate common bigrams
  tok <- quanteda::tokens_compound(tok, pattern = compounds)

  if (stem == TRUE) {
    tok <- quanteda::tokens_wordstem(tok)
  }

  message("Casting to dfm...")
  bags_dfm <- quanteda::dfm(tok, verbose = TRUE, tolower = TRUE)

  return(bags_dfm)
}

# Flag documents that use at least one dictionary term, and count tokens.
#
# Builds a dfm from `data` (punctuation, symbols, URLs and stop words
# removed; common phrases compounded; optionally stemmed) and looks up the
# dictionary terms among its feature names.
#
# Arguments:
#   data         data frame handed to quanteda::corpus().
#   text_col     name of the column holding the document text.
#   doc_col      name of the column holding the document IDs.
#   to_compound  character vector of extra multi-word phrases to join into
#                single tokens, in addition to the built-in list; NULL
#                (default) uses the built-in list only.
#   dictionary   character vector compared (exact match) with the dfm feature
#                names, i.e. with lower-cased and, when stem = TRUE, stemmed
#                tokens.
#   stem         if TRUE (default), stem tokens with quanteda::tokens_wordstem().
#
# Returns a list with two elements:
#   token_count         per-document feature total (row sums of the dfm).
#   docs_in_dictionary  IDs of documents with at least one dictionary term.
dictionary_flag <- function(data, text_col, doc_col, to_compound = NULL,
                            dictionary,
                            stem = TRUE) {
  # Attach quanteda like the sibling helpers do; this function previously
  # called dfm() unqualified without ever loading the package.
  library(quanteda)

  message("Making corpus")
  corp <- quanteda::corpus(data, text_field = text_col, docid_field = doc_col)

  message("Setting compounds...")
  # "united nations" is written with a space like every other phrase: with an
  # underscore, phrase() yields a single-token pattern that can never match
  # the two separate tokens "united" "nations".
  compounds <- quanteda::phrase(
    c("white house", "fox news", "united states",
      "united nations",
      "social media", "climate change", "right wing",
      to_compound)
  )

  message("Tokenizing...")
  # NOTE(review): `remove_twitter` appears to be deprecated in newer quanteda
  # releases -- confirm against the installed version before upgrading.
  tok <- quanteda::tokens(corp,
                          remove_punct = TRUE,
                          remove_symbols = TRUE,
                          remove_twitter = TRUE,
                          remove_url = TRUE)

  # Plain assignments instead of a %>% chain, so this function does not depend
  # on some other package having attached the magrittr pipe.
  # drop tokens that contain a Unicode separator character
  tok <- quanteda::tokens_remove(tok, "\\p{Z}", valuetype = "regex")
  # NOTE(review): as an unanchored regex this drops every token that
  # *contains* "via" (e.g. "bolivia"), not only the word "via" -- confirm
  # that this is intended before anchoring it as "^via$".
  tok <- quanteda::tokens_remove(tok, "via", valuetype = "regex")
  # remove stop words
  tok <- quanteda::tokens_remove(tok, quanteda::stopwords(source = "snowball"))
  # concatenate common bigrams
  tok <- quanteda::tokens_compound(tok, pattern = compounds)

  if (stem == TRUE) {
    tok <- quanteda::tokens_wordstem(tok)
  }

  message("Casting to dfm...")
  bags_dfm <- quanteda::dfm(tok, verbose = TRUE, tolower = TRUE)

  tcount <- rowSums(bags_dfm)

  message("Flagging docs in dictionary...")
  dict_cols <- which(colnames(bags_dfm) %in% dictionary)
  has_term <- which(rowSums(bags_dfm[, dict_cols]) > 0)
  docs_to_return <- rownames(bags_dfm)[has_term]

  return(list(token_count = tcount,
              docs_in_dictionary = docs_to_return))
}

## ggplot theme
# Project ggplot2 theme: theme_classic() with 16pt serif text, bold facet
# strip labels and a bold 20pt plot title. Takes no arguments and returns a
# theme object to add to a plot with `+`.
theme_jg <- function(){

  # Calls are namespaced, like the rest of this file, so the theme works even
  # when ggplot2 is installed but not attached.
  ggplot2::theme_classic() +
    ggplot2::theme(
      text = ggplot2::element_text(family = "serif", size = 16),
      strip.text = ggplot2::element_text(face = "bold"),
      plot.title = ggplot2::element_text(face = "bold", size = 20)
    )

}